You'll have to run all the cells on a notebook server to see all the outputs. To do so, first fetch all the necessary data:
!pip install PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
downloaded = drive.CreateFile({'id':"1iYE4FC21KcEcrluQvGduImMZPjolf2tD"}) # replace the id with id of file you want to access
downloaded.GetContentFile('data_4q .csv')
downloaded2 = drive.CreateFile({'id':"1olfnp-PAi-NJXtgoguWsqIvQLZK_kkiy"}) # replace the id with id of file you want to access
downloaded2.GetContentFile('df_atp.csv')
!pip install facets-overview
# Load UCI census train and test data into dataframes.
import pandas as pd
df_atp = pd.read_csv(
"df_atp.csv", index_col=0)
df = pd.read_csv(
"data_4q .csv", index_col=0)
df.drop(df.columns[13:40],axis=1,inplace=True)
df.drop(["PSL",'UBW', 'UBL', 'LBW', 'LBL', 'SJW', 'SJL', 'MaxW', 'MaxL','AvgW', 'AvgL'],axis=1,inplace=True)
df.columns
Let's take a good look at what our data is about using this super cool component called Facets (take a look at the docs: https://pair-code.github.io/facets/)! You can explore your data in very different ways, and we'll use it for two purposes:
# Build the Facets Overview feature statistics for df_atp and render them.
import base64
from facets_overview.generic_feature_statistics_generator import GenericFeatureStatisticsGenerator

# Serialise the statistics proto and base64-encode it so it can be
# embedded directly into the HTML template below.
stats_gen = GenericFeatureStatisticsGenerator()
stats_proto = stats_gen.ProtoFromDataFrames([{'name': 'train', 'table': df_atp}])
protostr = base64.b64encode(stats_proto.SerializeToString()).decode("utf-8")

# Inject the encoded stats into the facets-overview web component.
from IPython.core.display import display, HTML
HTML_TEMPLATE = """
<script src="https://cdnjs.cloudflare.com/ajax/libs/webcomponentsjs/1.3.3/webcomponents-lite.js"></script>
<link rel="import" href="https://raw.githubusercontent.com/PAIR-code/facets/1.0.0/facets-dist/facets-jupyter.html" >
<facets-overview id="elem"></facets-overview>
<script>
document.querySelector("#elem").protoInput = "{protostr}";
</script>"""
display(HTML(HTML_TEMPLATE.format(protostr=protostr)))
From the above charts we can already observe:
And how are the surfaces spread across the Series? The court types? And by tournament? Check the Facets Dives below for the answers.
The prepared data used here can be viewed at Tennis Part 2: Modelization & Prediction: https://www.kaggle.com/danielfmfurlan/tennis
Getting the most correlated features
# Correlation analysis: draw a masked heatmap of the pairwise feature
# correlations in ``df`` and print the 10 strongest (absolute) pairs.
cor = df.corr()
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import numpy as np

# Mask the upper triangle so each correlation pair is drawn only once.
mask = np.zeros_like(cor)
mask[np.triu_indices_from(mask)] = True
with sns.axes_style("white"):
    # plt.subplots creates the figure used for drawing; the original
    # extra plt.figure(figsize=(28, 20)) call only produced an orphan
    # blank figure, so it has been removed.
    f, ax = plt.subplots(figsize=(28, 20))
    ax = sns.heatmap(cor, center=0, linewidth=0.9, vmin=-1, vmax=1,
                     cmap=sns.color_palette("RdBu_r", 7), annot=False,
                     mask=mask, square=True, fmt='.g')

# Keep only the strict upper triangle (k=1 skips the self-correlation
# diagonal), stack into a Series of pairs and rank by strength.
# FIX: the deprecated np.bool alias was removed in NumPy 1.24 and raises
# AttributeError on current installs -- use the builtin bool instead.
corr_m = cor.abs()
sol = (corr_m.where(np.triu(np.ones(corr_m.shape), k=1).astype(bool))
       .stack()
       .sort_values(ascending=False))
print("les 10 variables qui ont une plus forte correlation : \n", sol[:10])
The Facets Dive is an interesting tool for visualizing data in a friendly way:
# Render the Facets Dive visualization for the training dataframe.
from IPython.core.display import display, HTML

# Facets Dive consumes the rows as a JSON array of records.
jsonstr = df.to_json(orient='records')
HTML_TEMPLATE = """
<script src="https://cdnjs.cloudflare.com/ajax/libs/webcomponentsjs/1.3.3/webcomponents-lite.js"></script>
<link rel="import" href="https://raw.githubusercontent.com/PAIR-code/facets/1.0.0/facets-dist/facets-jupyter.html">
<facets-dive id="elem" height="600"></facets-dive>
<script>
var data = {jsonstr};
document.querySelector("#elem").data = data;
</script>"""
rendered = HTML_TEMPLATE.format(jsonstr=jsonstr)
display(HTML(rendered))